from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import lightgbm as lgb
import pickle
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import sys
import pydot
import warnings
warnings.simplefilter(action='ignore')
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn import metrics
1.Import and understand the data.
1.A. Import ‘signal-data.csv’ as DataFrame
df_signal = pd.read_csv('/content/drive/MyDrive/signal-data.csv')
df_signal.head()
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
5 rows × 592 columns
print("The Shape of dataframe is", df_signal.shape)
The Shape of dataframe is (1567, 592)
1.B. Print 5 point summary and share at least 2 observations.
df_signal.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1561.0 | 3014.452896 | 73.621787 | 2743.2400 | 2966.260000 | 3011.4900 | 3056.6500 | 3356.3500 |
| 1 | 1560.0 | 2495.850231 | 80.407705 | 2158.7500 | 2452.247500 | 2499.4050 | 2538.8225 | 2846.4400 |
| 2 | 1553.0 | 2200.547318 | 29.513152 | 2060.6600 | 2181.044400 | 2201.0667 | 2218.0555 | 2315.2667 |
| 3 | 1553.0 | 1396.376627 | 441.691640 | 0.0000 | 1081.875800 | 1285.2144 | 1591.2235 | 3715.0417 |
| 4 | 1553.0 | 4.197013 | 56.355540 | 0.6815 | 1.017700 | 1.3168 | 1.5257 | 1114.5366 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 586 | 1566.0 | 0.021458 | 0.012358 | -0.0169 | 0.013425 | 0.0205 | 0.0276 | 0.1028 |
| 587 | 1566.0 | 0.016475 | 0.008808 | 0.0032 | 0.010600 | 0.0148 | 0.0203 | 0.0799 |
| 588 | 1566.0 | 0.005283 | 0.002867 | 0.0010 | 0.003300 | 0.0046 | 0.0064 | 0.0286 |
| 589 | 1566.0 | 99.670066 | 93.891919 | 0.0000 | 44.368600 | 71.9005 | 114.7497 | 737.3048 |
| Pass/Fail | 1567.0 | -0.867262 | 0.498010 | -1.0000 | -1.000000 | -1.0000 | -1.0000 | 1.0000 |
591 rows × 8 columns
Observation:
df_signal['Time'].info()
<class 'pandas.core.series.Series'> RangeIndex: 1567 entries, 0 to 1566 Series name: Time Non-Null Count Dtype -------------- ----- 1567 non-null object dtypes: object(1) memory usage: 12.4+ KB
df_signal['Time'] = pd.to_datetime(df_signal['Time'])
2. Data cleansing:
2.A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature.
df_signal.isna().sum()
Time 0
0 6
1 7
2 14
3 14
..
586 1
587 1
588 1
589 1
Pass/Fail 0
Length: 592, dtype: int64
col_name=df_signal.columns
count=0
col_with_null_nalues = []
for i in col_name:
if (((df_signal[i].isna().sum())/df_signal.shape[1])>0.2):
count=count+1
df_signal.drop(columns=[i], inplace=True) ## Deleting the columns which has more than 20% null values
col_with_null_nalues.append(i)
else :
df_signal[i].fillna(df_signal[i].mean(), inplace=True) #Imputing with mean
print("Numbers of columns deleted =", count)
print("Numbers of columns in the dataset", df_signal.shape[1])
print("Deleted columns:", col_with_null_nalues)
Numbers of columns deleted = 52 Numbers of columns in the dataset 540 Deleted columns: ['72', '73', '85', '109', '110', '111', '112', '157', '158', '220', '244', '245', '246', '247', '292', '293', '345', '346', '358', '382', '383', '384', '385', '492', '516', '517', '518', '519', '546', '547', '548', '549', '550', '551', '552', '553', '554', '555', '556', '557', '562', '563', '564', '565', '566', '567', '568', '569', '578', '579', '580', '581']
df_signal.head()
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | 14.9509 | 0.5005 | 0.0118 | 0.0035 | 2.3630 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 10.9003 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.009600 | 0.020100 | 0.006000 | 208.204500 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 9.2721 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.058400 | 0.048400 | 0.014800 | 82.860200 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 8.5831 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.020200 | 0.014900 | 0.004400 | 73.843200 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | 10.9698 | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.020200 | 0.014900 | 0.004400 | 73.843200 | -1 |
5 rows × 540 columns
2.B. Identify and drop the features which are having same value for all the rows.
col_name2=df_signal.columns
q=[]
for j in col_name2:
if(len(df_signal[j].value_counts())==1):
q.append(j)
df_signal.drop(columns=[j], inplace=True, axis=1)
print("Numbers of column in the dataset", df_signal.shape[1])
print("Columns awith same data in all the rows:", q)
Numbers of column in the dataset 424 Columns awith same data in all the rows: ['5', '13', '42', '49', '52', '69', '97', '141', '149', '178', '179', '186', '189', '190', '191', '192', '193', '194', '226', '229', '230', '231', '232', '233', '234', '235', '236', '237', '240', '241', '242', '243', '256', '257', '258', '259', '260', '261', '262', '263', '264', '265', '266', '276', '284', '313', '314', '315', '322', '325', '326', '327', '328', '329', '330', '364', '369', '370', '371', '372', '373', '374', '375', '378', '379', '380', '381', '394', '395', '396', '397', '398', '399', '400', '401', '402', '403', '404', '414', '422', '449', '450', '451', '458', '461', '462', '463', '464', '465', '466', '481', '498', '501', '502', '503', '504', '505', '506', '507', '508', '509', '512', '513', '514', '515', '528', '529', '530', '531', '532', '533', '534', '535', '536', '537', '538']
print(df_signal.shape)
(1567, 424)
2.C Drop other features if required using relevant functional knowledge. Clearly justify the same.
Check for the duplicate rows
df_signal.duplicated().sum()
0
As mention in the data description of the project, time is used only for data time stamp for that specific test point. So we can drop it.
Target column “ –1” corresponds to a pass and “1” corresponds to a fail. Update the pass as 0 and fail as 1.
df_signal = df_signal.drop('Time', axis=1)
df_signal['Pass/Fail'] = df_signal['Pass/Fail'].replace({ -1:0, 1:1})
df_signal['Pass/Fail'] = df_signal['Pass/Fail'].astype(int)
df_signal.sample(5)
| 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 399 | 2996.53 | 2533.52 | 2202.1222 | 1034.5674 | 0.7760 | 104.6156 | 0.1219 | 1.5320 | 0.0068 | -0.0173 | ... | 8.3030 | 0.5022 | 0.0156 | 0.0037 | 3.1083 | 0.0120 | 0.0104 | 0.0036 | 86.7035 | 0 |
| 44 | 3047.78 | 2490.71 | 2166.5222 | 907.0746 | 1.0647 | 104.5211 | 0.1221 | 1.5764 | -0.0219 | -0.0080 | ... | 8.2142 | 0.5009 | 0.0155 | 0.0041 | 3.0904 | 0.0149 | 0.0158 | 0.0054 | 106.1812 | 0 |
| 730 | 3136.34 | 2442.45 | 2250.7445 | 996.4071 | 0.8572 | 106.2956 | 0.1172 | 1.4285 | -0.0222 | -0.0006 | ... | 13.2632 | 0.5022 | 0.0173 | 0.0040 | 3.4516 | 0.0335 | 0.0084 | 0.0030 | 25.1494 | 0 |
| 618 | 2993.11 | 2498.91 | 2171.8556 | 940.9917 | 1.2906 | 103.4733 | 0.1234 | 1.4701 | -0.0181 | -0.0010 | ... | 14.0761 | 0.4959 | 0.0126 | 0.0037 | 2.5402 | 0.0227 | 0.0149 | 0.0052 | 65.4831 | 0 |
| 651 | 2978.62 | 2478.81 | 2236.0667 | 1680.1825 | 1.4834 | 98.6889 | 0.1221 | 1.4149 | -0.0045 | 0.0085 | ... | 7.6125 | 0.4993 | 0.0121 | 0.0029 | 2.4170 | 0.0297 | 0.0115 | 0.0040 | 38.7106 | 0 |
5 rows × 423 columns
Check for zero standard deviation
X = df_signal.drop('Pass/Fail',axis=1)
#Test data
y=df_signal['Pass/Fail']
#check zero stanadard deviation in the columns
zero_std_columns = []
for column in X.columns:
std_dev = X[column].std()
std_dev= std_dev.round()
if std_dev == 0:
zero_std_columns.append(column)
X = X.drop(zero_std_columns, axis=1)
print("Columns with zero standard deviation so deleting it:", zero_std_columns)
print("Numbers of column in the dataset", X.shape[1])
Columns with zero standard deviation so deleting it: ['7', '8', '9', '10', '11', '17', '19', '20', '25', '26', '29', '30', '37', '38', '44', '47', '53', '54', '56', '57', '58', '61', '74', '75', '76', '77', '78', '79', '80', '81', '82', '84', '86', '87', '89', '91', '92', '93', '94', '95', '96', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '113', '114', '116', '118', '119', '120', '121', '123', '124', '125', '126', '127', '128', '130', '131', '132', '143', '144', '145', '146', '147', '153', '156', '163', '164', '165', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '181', '184', '195', '206', '210', '211', '212', '213', '214', '215', '216', '217', '219', '221', '222', '224', '227', '228', '238', '239', '248', '249', '251', '253', '254', '255', '267', '278', '279', '280', '281', '282', '288', '290', '291', '298', '299', '300', '301', '302', '303', '304', '305', '306', '307', '308', '309', '310', '311', '312', '317', '320', '331', '334', '342', '347', '348', '349', '350', '351', '352', '353', '354', '355', '356', '357', '359', '360', '362', '365', '366', '367', '368', '376', '377', '386', '387', '389', '391', '392', '393', '405', '407', '441', '443', '444', '445', '446', '447', '448', '542', '543', '544', '558', '559', '560', '571', '573', '575', '582', '583', '584', '586', '587', '588'] Numbers of column in the dataset 229
Deleted the columns which has zero standar deviation. Because if any feature contains zero stanad deviation means it is Non-Informative Feature. And non-informative features can lead to overfitting.
2.D Check for multi-collinearity in the data and take necessary action. 2.E Make all relevant modifications on the data using both functional/logical reasoning/assumptions.
X.corr()
| 0 | 1 | 2 | 3 | 4 | 6 | 12 | 14 | 15 | 16 | ... | 541 | 545 | 561 | 570 | 572 | 574 | 576 | 577 | 585 | 589 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.000000 | -0.143840 | 0.004756 | -0.007613 | -0.011014 | 0.002270 | 0.010368 | -0.007058 | 0.030675 | -0.005749 | ... | 0.034221 | -0.015287 | 0.037917 | -0.018953 | 0.013678 | 0.015206 | 0.013228 | 0.008601 | 0.023589 | 0.004174 |
| 1 | -0.143840 | 1.000000 | 0.005767 | -0.007568 | -0.001636 | -0.025564 | 0.034062 | -0.037667 | -0.087315 | -0.001878 | ... | -0.015439 | 0.040333 | -0.025492 | -0.009000 | 0.001753 | 0.001303 | 0.002570 | -0.010145 | 0.002273 | 0.044797 |
| 2 | 0.004756 | 0.005767 | 1.000000 | 0.298935 | 0.095891 | -0.136225 | 0.018326 | 0.006476 | 0.006115 | -0.000788 | ... | -0.004180 | 0.025334 | 0.025862 | -0.037070 | -0.000518 | 0.001342 | 0.002592 | -0.028705 | 0.015752 | -0.032890 |
| 3 | -0.007613 | -0.007568 | 0.298935 | 1.000000 | -0.058483 | -0.685835 | -0.028223 | -0.019827 | -0.013157 | -0.004596 | ... | 0.024721 | 0.046897 | 0.014912 | 0.002231 | 0.007634 | 0.006822 | 0.008216 | 0.016438 | 0.026019 | -0.080341 |
| 4 | -0.011014 | -0.001636 | 0.095891 | -0.058483 | 1.000000 | -0.074368 | -0.002707 | -0.017523 | 0.011435 | -0.001763 | ... | -0.044442 | 0.057173 | -0.025806 | 0.005273 | -0.012024 | -0.012264 | -0.012163 | -0.004070 | -0.001616 | 0.050910 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 574 | 0.015206 | 0.001303 | 0.001342 | 0.006822 | -0.012264 | 0.007783 | 0.032908 | 0.000409 | -0.024032 | -0.014005 | ... | 0.033459 | 0.060316 | -0.046796 | -0.307529 | 0.993689 | 1.000000 | 0.991738 | 0.851784 | -0.016812 | -0.020471 |
| 576 | 0.013228 | 0.002570 | 0.002592 | 0.008216 | -0.012163 | 0.007409 | 0.035743 | -0.000985 | -0.023509 | -0.014167 | ... | 0.033358 | 0.056812 | -0.046402 | -0.360498 | 0.994772 | 0.991738 | 1.000000 | 0.859278 | -0.017147 | -0.022567 |
| 577 | 0.008601 | -0.010145 | -0.028705 | 0.016438 | -0.004070 | -0.012342 | 0.031434 | 0.009505 | -0.019152 | -0.004396 | ... | 0.051837 | 0.061725 | -0.052455 | -0.247655 | 0.863768 | 0.851784 | 0.859278 | 1.000000 | -0.023910 | -0.024766 |
| 585 | 0.023589 | 0.002273 | 0.015752 | 0.026019 | -0.001616 | -0.039517 | 0.000523 | 0.002535 | 0.017745 | 0.002643 | ... | 0.009112 | -0.025979 | 0.023803 | 0.010143 | -0.017179 | -0.016812 | -0.017147 | -0.023910 | 1.000000 | -0.003800 |
| 589 | 0.004174 | 0.044797 | -0.032890 | -0.080341 | 0.050910 | 0.043777 | -0.036720 | 0.068161 | 0.009764 | -0.013918 | ... | -0.070310 | -0.061836 | 0.020439 | -0.010583 | -0.022672 | -0.020471 | -0.022567 | -0.024766 | -0.003800 | 1.000000 |
229 rows × 229 columns
Dropping the columns which has equal and more than 95% correlation between features.
correlation_matrix = X.corr()
threshold = 0.95
almost_same_correlations = {}
s=set()
# Iterate through the correlation matrix
for i in range(len(correlation_matrix.columns)):
for j in range(i):
correlation = correlation_matrix.iloc[i, j]
if abs(correlation) >= threshold:
col1 = correlation_matrix.columns[i]
col2 = correlation_matrix.columns[j]
if col1 not in almost_same_correlations:
almost_same_correlations[col1] = [col2]
else:
almost_same_correlations[col1].append(col2)
print(almost_same_correlations)
# Extract the values (lists of column names) and flatten them
columns_to_drop = []
for cols in almost_same_correlations.values():
for col in cols:
columns_to_drop.append(col)
# Drop the columns from the DataFrame
X = X.drop(columns=columns_to_drop, axis=1)
{'36': ['34'], '140': ['4'], '148': ['16'], '152': ['16', '148'], '252': ['117'], '271': ['136'], '272': ['137'], '274': ['139'], '275': ['4', '140'], '277': ['142'], '283': ['16', '148', '152'], '285': ['150'], '286': ['151'], '287': ['16', '148', '152', '283'], '289': ['154'], '294': ['159'], '295': ['160'], '296': ['161'], '297': ['162'], '318': ['182'], '319': ['183'], '321': ['185'], '323': ['187'], '324': ['188'], '332': ['196'], '333': ['197'], '335': ['199', '332'], '338': ['202'], '339': ['203'], '340': ['204'], '341': ['205'], '343': ['207'], '344': ['208'], '361': ['223'], '363': ['225'], '388': ['250'], '390': ['117', '252'], '406': ['268'], '408': ['135'], '409': ['136', '271'], '410': ['137', '272'], '411': ['138'], '415': ['142', '277'], '421': ['16', '148', '152', '154', '283', '287', '289'], '424': ['151', '286'], '425': ['148', '152', '283', '287', '421'], '427': ['148', '154', '283', '289', '421'], '428': ['155'], '435': ['430', '434'], '436': ['430', '434', '435'], '437': ['166'], '452': ['180'], '454': ['182', '318'], '455': ['183', '319'], '457': ['185', '321'], '459': ['187', '323'], '469': ['197', '333'], '470': ['198'], '475': ['203', '339'], '477': ['205', '341'], '478': ['209'], '479': ['207', '343'], '490': ['218'], '495': ['223', '361'], '497': ['225', '363'], '522': ['250', '388'], '524': ['117', '252', '390'], '540': ['268', '406'], '541': ['269'], '574': ['572'], '576': ['572', '574']}
Dropping the features that have less correlation with target Pass/Fail column.
#combining the independent and dependent vaiables tocheck corrlation with target variable
X['Pass/Fail']=y
plt.figure(figsize=(12, 10))
sns.heatmap(X.corr()[['Pass/Fail']].sort_values(by='Pass/Fail', ascending=True), cmap='coolwarm', linewidths=.5,fmt=".2f", vmin=-1, vmax=1 )
plt.title("Correlation Heatmap")
plt.show()
X.shape
(1567, 157)
Dropping the column which has less than .06 correlation .
correlation_threshold = 0.06
target_variable = 'Pass/Fail'
columns_to_drop = []
for column in X.columns:
if column != target_variable:
correlation = X[column].corr(X[target_variable])
if abs(correlation) < correlation_threshold:
columns_to_drop.append(column)
X = X.drop(columns_to_drop, axis=1)
X.head()
| 14 | 21 | 22 | 28 | 33 | 59 | 64 | 122 | 129 | 133 | ... | 294 | 295 | 316 | 431 | 436 | 437 | 452 | 460 | 510 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.9558 | -5419.00 | 2916.50 | 64.2333 | 9.5126 | -1.7264 | 21.7264 | 2.639 | -0.0473 | 1000.7263 | ... | 418.1363 | 398.3185 | 6.2698 | 33.1562 | 3.1158 | 3.1136 | 5.9396 | 29.9394 | 64.6707 | 0 |
| 1 | 10.1548 | -5441.50 | 2604.25 | 68.4222 | 9.7997 | 0.8073 | 19.1927 | 2.541 | -0.0946 | 998.1081 | ... | 233.9865 | 26.5879 | 5.6522 | 2.2655 | 1.6779 | 3.2153 | 5.1072 | 40.4475 | 141.4365 | 0 |
| 2 | 9.5157 | -5447.75 | 2701.75 | 67.1333 | 8.6590 | 23.8245 | 16.1755 | 2.882 | -0.1892 | 998.4440 | ... | 251.4536 | 329.6406 | 5.7247 | 29.1663 | 0.8972 | 3.1281 | 4.8795 | 32.3594 | 240.7767 | 1 |
| 3 | 9.6052 | -5468.25 | 2648.25 | 62.9333 | 8.6789 | 24.3791 | 15.6209 | 3.132 | 0.2838 | 980.4510 | ... | 415.5048 | 157.0889 | 5.4440 | 13.4051 | 1.3671 | 2.7013 | 4.4680 | 27.6824 | 113.5593 | 0 |
| 4 | 10.5661 | -5476.25 | 2635.25 | 62.8333 | 8.7677 | -12.2945 | 32.2945 | 3.148 | -0.5677 | 993.1274 | ... | 319.1252 | 128.0296 | 4.8956 | 10.7390 | 1.5533 | 6.2069 | 4.3131 | 30.8924 | 148.0663 | 0 |
5 rows × 21 columns
#listing all the correlation with target.
a=X.corr()
a['Pass/Fail']
14 -0.068975 21 0.107997 22 -0.073380 28 -0.106767 33 0.080945 59 0.155771 64 0.076551 122 -0.078362 129 0.103351 133 0.067789 200 0.060595 294 0.081761 295 0.091831 316 -0.089410 431 0.120304 436 0.106426 437 0.069692 452 -0.077100 460 0.060587 510 0.131587 Pass/Fail 1.000000 Name: Pass/Fail, dtype: float64
y=X['Pass/Fail']
X=X.drop('Pass/Fail',axis=1)
Apply forward feature selection.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
sfs1 = SFS(DecisionTreeClassifier(random_state=1),
k_features=10,
forward=True,
floating=False,
verbose=2,
scoring='r2')
sfs1 = sfs1.fit(np.array(X_train), y_train)
[2023-10-22 08:32:14] Features: 1/10 -- score: -0.0711083437110834 [2023-10-22 08:32:14] Features: 2/10 -- score: -0.3350809464508094 [2023-10-22 08:32:15] Features: 3/10 -- score: -0.3074252801992528 [2023-10-22 08:32:15] Features: 4/10 -- score: -0.8295662100456619 [2023-10-22 08:32:15] Features: 5/10 -- score: -0.6444146948941469 [2023-10-22 08:32:16] Features: 6/10 -- score: -0.607491697799917 [2023-10-22 08:32:17] Features: 7/10 -- score: -0.720601909506019 [2023-10-22 08:32:17] Features: 8/10 -- score: -0.6762162723121626 [2023-10-22 08:32:18] Features: 9/10 -- score: -0.585640307181403 [2023-10-22 08:32:18] Features: 10/10 -- score: -0.6460730593607306
Creating a new dataframe with selected 10 independent features.
new_df = X.iloc[:, list(sfs1.k_feature_idx_)]
new_df['Pass/Fail'] = y
new_df.head()
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 64.2333 | -1.7264 | 21.7264 | 2.639 | -0.0473 | 1000.7263 | 10.30 | 398.3185 | 6.2698 | 29.9394 | 0 |
| 1 | 68.4222 | 0.8073 | 19.1927 | 2.541 | -0.0946 | 998.1081 | 8.02 | 26.5879 | 5.6522 | 40.4475 | 0 |
| 2 | 67.1333 | 23.8245 | 16.1755 | 2.882 | -0.1892 | 998.4440 | 16.73 | 329.6406 | 5.7247 | 32.3594 | 1 |
| 3 | 62.9333 | 24.3791 | 15.6209 | 3.132 | 0.2838 | 980.4510 | 13.56 | 157.0889 | 5.4440 | 27.6824 | 0 |
| 4 | 62.8333 | -12.2945 | 32.2945 | 3.148 | -0.5677 | 993.1274 | 19.77 | 128.0296 | 4.8956 | 30.8924 | 0 |
3. Data analysis & visualisation
3.A Perform a detailed univariate Analysis with appropriate detailed comments after each analysis.
def hist_plot(df):
cat_columns = df.columns
ncols = 3
nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
plt.figure(figsize=(30,30))
for i, j in enumerate(cat_columns):
plt.subplot(nrows, ncols, i+1)
ax=sns.histplot(x=j,data=df,hue='Pass/Fail')
ax.set_title("Histogram of "+j)
return plt.show()
new_df['Pass/Fail'] = new_df['Pass/Fail'].astype('category')
new_df['Pass/Fail'].value_counts()
0 1463 1 104 Name: Pass/Fail, dtype: int64
new_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 28 1567 non-null float64 1 59 1567 non-null float64 2 64 1567 non-null float64 3 122 1567 non-null float64 4 129 1567 non-null float64 5 133 1567 non-null float64 6 200 1567 non-null float64 7 295 1567 non-null float64 8 316 1567 non-null float64 9 460 1567 non-null float64 10 Pass/Fail 1567 non-null category dtypes: category(1), float64(10) memory usage: 124.2 KB
hist_plot(new_df)
def box_plot(df):
cat_columns = []
for i in df.columns:
if (((df[i].dtype.name) == 'float64') ):
cat_columns.append(i)
ncols = 3
nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
plt.figure(figsize=(30,30))
for i, j in enumerate(cat_columns):
plt.subplot(nrows, ncols, i+1)
ax=sns.boxplot(X[j],color='blue')
ax.set_title("Histogram of "+j)
return plt.show()
box_plot(new_df)
Analysis:
Adjust Outliers.
for i in new_df.columns:
if (((new_df[i].dtype.name) == 'float64') ):
Q1=X[i].quantile(0.25)
Q3=X[i].quantile(0.75)
IQR=Q3-Q1
lower = Q1 - 1.5*IQR
upper = Q3 + 1.5*IQR
median = X[i].median()
new_df[i][new_df[i] < lower] = median
new_df[i][new_df[i] > upper] = median
Perform bivariate and multivariate analysis with appropriate detailed comments after each analysis
def bar_plot(df):
cat_columns = []
for i in df.columns:
if (((df[i].dtype.name) == 'float64')):
cat_columns.append(i)
ncols = 3
nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
plt.figure(figsize=(30,30))
for i, j in enumerate(cat_columns):
plt.subplot(nrows, ncols, i + 1)
ax = sns.barplot(x=j, y='Pass/Fail', data=df)
ax.set_title("Bar Plot of " + j)
plt.tight_layout()
plt.show()
bar_plot(new_df)
def scatter_plot(df):
cat_columns = []
for i in df.columns:
if (((df[i].dtype.name) == 'float64')):
cat_columns.append(i)
ncols = 3
nrows = len(cat_columns) // ncols + (len(cat_columns) % ncols > 0)
plt.figure(figsize=(30,30))
for i, j in enumerate(cat_columns):
plt.subplot(nrows, ncols, i + 1)
ax = sns.pointplot(x=j, y='Pass/Fail', data=df)
ax.set_title("scatter Plot of " + j)
plt.tight_layout()
plt.show()
scatter_plot(new_df)
Analysis -
Multi Variate plot
sns.pairplot(new_df, hue ='Pass/Fail', corner=True)
<seaborn.axisgrid.PairGrid at 0x7aa966951c60>
sns.heatmap(new_df.corr(),vmin=-1, vmax=1 )
plt.title("Correlation Heatmap")
plt.show()
Analysis:
4. Data pre-processing
4.A Segregate predictors vs target attributes
predictor = new_df.drop('Pass/Fail',axis=1)
target=new_df['Pass/Fail']
target.value_counts()
0 1463 1 104 Name: Pass/Fail, dtype: int64
The Target is not balanced.
#RandomOverSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
ROS = RandomOverSampler()
X_ros1, y_ros1 = ROS.fit_resample(predictor,target)
X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_ros1, y_ros1, test_size=0.25, random_state=1)
print('Original dataset shape {}'.format(Counter(target)))
print('After Over sampling shape {}'.format(Counter(y_ros1)))
print('Resampled dataset shape {}'.format(Counter(y_train_ros)))
print('Resampled dataset shape {}'.format(Counter(y_test_ros)))
Original dataset shape Counter({0: 1463, 1: 104})
After Over sampling shape Counter({0: 1463, 1: 1463})
Resampled dataset shape Counter({1: 1101, 0: 1093})
Resampled dataset shape Counter({0: 370, 1: 362})
#RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler()
X_train_rus, y_train_rus = undersample.fit_resample(predictor,target)
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_train_rus, y_train_rus, test_size=0.25, random_state=5)
print('Original dataset shape {}'.format(Counter(target)))
print('After Over sampling shape {}'.format(Counter(y_train_rus)))
print('Resampled dataset shape {}'.format(Counter(y_train_rus)))
print('Resampled dataset shape {}'.format(Counter(y_test_rus)))
Original dataset shape Counter({0: 1463, 1: 104})
After Over sampling shape Counter({0: 83, 1: 73})
Resampled dataset shape Counter({0: 83, 1: 73})
Resampled dataset shape Counter({1: 31, 0: 21})
Scaling
X_train_rus_scaled = X_train_rus.apply(zscore)
X_train_rus_scaled.sample(5)
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 49 | -1.664801 | -2.413144 | 2.106716 | -0.727778 | 0.095865 | -0.069291 | -0.477063 | -1.179800 | 0.947756 | -0.928872 |
| 43 | 1.137049 | -0.337824 | 0.000450 | 1.100253 | 0.250467 | 0.327205 | 2.247847 | -0.792831 | -0.245084 | -0.730333 |
| 10 | 0.188612 | 0.305372 | 0.219912 | -0.392921 | 0.173166 | -1.282260 | -0.033754 | 2.828865 | -0.233635 | 1.687242 |
| 97 | 0.415713 | -0.042034 | -0.299752 | 0.665070 | -0.213339 | -1.232266 | 1.037035 | -0.298007 | 0.685225 | -0.486726 |
| 30 | -0.950167 | -0.182126 | -2.163097 | -0.512792 | 1.410145 | 0.716021 | -1.060306 | 0.826613 | -1.186502 | -0.190533 |
X_train_ros_scaled = X_train_ros.apply(zscore)
X_train_ros_scaled.sample(5)
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2168 | 0.595146 | -1.106932 | 0.793389 | -1.221683 | -2.848394 | -1.542294 | -0.377662 | -0.031744 | 0.511652 | -1.579579 |
| 1797 | -0.155191 | -0.114409 | -1.192452 | -1.397890 | 0.123528 | 1.151438 | 0.433035 | -1.065471 | -1.871358 | -0.233724 |
| 2819 | -0.705625 | -0.114409 | 1.625472 | -0.628075 | 1.015403 | -0.487242 | 0.171676 | 0.308400 | 1.793004 | 0.826359 |
| 2857 | -0.250203 | -0.114409 | 0.460314 | -0.790536 | 0.272148 | 0.192397 | -1.505377 | -0.982130 | -0.740909 | -0.005067 |
| 1219 | -0.466443 | -0.714622 | 0.398440 | -1.030479 | -0.025091 | -1.158400 | 0.125696 | -1.562560 | -0.013861 | -0.527385 |
4.D Check if the train and test data have similar statistical characteristics when compared with original data.
X_train_ros_scaled.describe()
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 | 2.194000e+03 |
| mean | -3.037781e-15 | 3.238572e-18 | 6.962930e-17 | -7.740188e-16 | 6.477144e-18 | -3.270958e-15 | 3.400501e-16 | 1.465454e-16 | 6.517627e-16 | -6.477144e-17 |
| std | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 | 1.000228e+00 |
| min | -2.576529e+00 | -3.021212e+00 | -2.531580e+00 | -2.617598e+00 | -3.442244e+00 | -3.047521e+00 | -2.964631e+00 | -2.469624e+00 | -2.370397e+00 | -2.058927e+00 |
| 25% | -5.975201e-01 | -4.685990e-01 | -6.817915e-01 | -7.839754e-01 | -4.711075e-01 | -6.418430e-01 | -6.626157e-01 | -6.672683e-01 | -6.321166e-01 | -6.746673e-01 |
| 50% | -5.032971e-02 | -1.144091e-01 | 4.984731e-02 | -1.406923e-01 | 1.235283e-01 | -5.021698e-02 | -5.096330e-02 | -1.051319e-01 | -8.589300e-02 | -2.234368e-01 |
| 75% | 6.303629e-01 | 3.897467e-01 | 5.786758e-01 | 6.703631e-01 | 4.950775e-01 | 8.315572e-01 | 6.284489e-01 | 6.802064e-01 | 5.257164e-01 | 5.516281e-01 |
| max | 2.669168e+00 | 3.033921e+00 | 2.753641e+00 | 2.918573e+00 | 2.427133e+00 | 2.739132e+00 | 2.732025e+00 | 3.255473e+00 | 2.891353e+00 | 2.961686e+00 |
X_test_ros_scaled = X_test_ros.apply(zscore)
X_test_ros_scaled.describe()
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 | 7.320000e+02 |
| mean | 1.004661e-15 | 3.882747e-17 | 5.581449e-16 | -1.613767e-16 | 9.706868e-18 | -1.344644e-14 | -4.222488e-16 | -3.621875e-16 | 1.048342e-15 | 4.610762e-17 |
| std | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 | 1.000684e+00 |
| min | -2.493913e+00 | -2.865051e+00 | -2.358150e+00 | -2.703983e+00 | -3.373055e+00 | -2.859493e+00 | -2.839141e+00 | -2.214331e+00 | -2.321334e+00 | -1.956372e+00 |
| 25% | -5.294866e-01 | -5.095051e-01 | -6.678818e-01 | -8.463462e-01 | -5.324393e-01 | -6.553393e-01 | -6.911868e-01 | -7.600053e-01 | -6.389683e-01 | -6.586029e-01 |
| 50% | -2.639759e-02 | -1.420608e-01 | 1.807078e-02 | -3.500927e-02 | 1.235258e-01 | -8.915859e-02 | 1.229159e-02 | -4.971446e-02 | -1.108107e-01 | -2.106576e-01 |
| 75% | 6.555514e-01 | 3.873871e-01 | 5.802138e-01 | 6.685821e-01 | 4.149973e-01 | 7.951738e-01 | 6.600779e-01 | 7.363206e-01 | 5.444727e-01 | 5.011955e-01 |
| max | 2.358038e+00 | 2.748725e+00 | 2.645270e+00 | 3.027301e+00 | 2.382430e+00 | 2.712791e+00 | 2.704269e+00 | 2.842560e+00 | 2.980978e+00 | 2.946360e+00 |
X_train_rus_scaled.describe()
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 | 1.560000e+02 |
| mean | 3.905708e-15 | -7.401487e-17 | 5.978124e-17 | 7.970832e-17 | 4.554761e-17 | -5.921189e-16 | 3.273735e-17 | -7.187982e-17 | 8.540177e-16 | 2.846726e-16 |
| std | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 | 1.003221e+00 |
| min | -2.593190e+00 | -2.541188e+00 | -2.304132e+00 | -2.331703e+00 | -3.381372e+00 | -2.836720e+00 | -2.569738e+00 | -2.118678e+00 | -2.062627e+00 | -1.938437e+00 |
| 25% | -5.736220e-01 | -3.892945e-01 | -6.483117e-01 | -8.079093e-01 | -5.227067e-01 | -6.472617e-01 | -6.246233e-01 | -7.648309e-01 | -6.646130e-01 | -6.890714e-01 |
| 50% | -5.183300e-02 | -1.821258e-01 | -9.624247e-03 | -6.979107e-02 | 9.586462e-02 | -6.377202e-02 | -7.812490e-02 | -4.171869e-03 | -1.901067e-01 | -2.253345e-01 |
| 75% | 6.720203e-01 | 3.160072e-01 | 5.146136e-01 | 7.432465e-01 | 4.050686e-01 | 7.650487e-01 | 6.526782e-01 | 6.252778e-01 | 5.598536e-01 | 3.850439e-01 |
| max | 2.118848e+00 | 2.904413e+00 | 2.772568e+00 | 2.837079e+00 | 2.492195e+00 | 2.581711e+00 | 2.653784e+00 | 2.828865e+00 | 2.651490e+00 | 3.009957e+00 |
p=predictor.apply(zscore)
p.describe()
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.567000e+03 | 1567.000000 | 1.567000e+03 | 1.567000e+03 | 1.567000e+03 | 1.567000e+03 | 1.567000e+03 | 1.567000e+03 | 1.567000e+03 | 1.567000e+03 |
| mean | -1.414737e-15 | 0.000000 | -8.161946e-17 | -3.355467e-16 | 1.813766e-17 | 1.988794e-14 | -5.259921e-16 | 2.902025e-16 | -3.627532e-17 | -3.627532e-16 |
| std | 1.000319e+00 | 1.000319 | 1.000319e+00 | 1.000319e+00 | 1.000319e+00 | 1.000319e+00 | 1.000319e+00 | 1.000319e+00 | 1.000319e+00 | 1.000319e+00 |
| min | -2.766770e+00 | -3.048239 | -2.668195e+00 | -2.598893e+00 | -3.326342e+00 | -2.804903e+00 | -2.762710e+00 | -2.234737e+00 | -2.504976e+00 | -1.911839e+00 |
| 25% | -6.251964e-01 | -0.597686 | -6.708041e-01 | -7.809149e-01 | -5.028984e-01 | -6.402378e-01 | -6.900538e-01 | -7.603113e-01 | -6.696210e-01 | -7.176864e-01 |
| 50% | -1.114003e-01 | 0.100060 | 2.097773e-02 | 2.298543e-02 | 9.193274e-02 | -4.919240e-02 | 1.653343e-02 | -1.805165e-02 | -9.412606e-02 | -1.389204e-01 |
| 75% | 7.987121e-01 | 0.497567 | 6.100452e-01 | 5.953340e-01 | 3.892697e-01 | 7.117756e-01 | 6.642384e-01 | 7.245946e-01 | 5.765615e-01 | 5.785033e-01 |
| max | 2.455863e+00 | 3.453976 | 2.854026e+00 | 2.661488e+00 | 2.544963e+00 | 2.738357e+00 | 2.805198e+00 | 3.132479e+00 | 2.735286e+00 | 2.937623e+00 |
X_test_rus_scaled = X_test_rus.apply(zscore)
X_test_rus_scaled.describe()
| 28 | 59 | 64 | 122 | 129 | 133 | 200 | 295 | 316 | 460 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 | 5.200000e+01 |
| mean | -1.938620e-15 | -8.807058e-17 | 4.056584e-17 | -1.216975e-16 | -8.273297e-18 | -7.899664e-16 | -6.138252e-16 | 3.373370e-16 | 3.031763e-16 | -1.281027e-16 |
| std | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 | 1.009756e+00 |
| min | -2.074250e+00 | -2.499621e+00 | -2.031822e+00 | -1.935863e+00 | -3.136782e+00 | -2.338093e+00 | -2.243822e+00 | -1.953942e+00 | -1.591972e+00 | -1.565324e+00 |
| 25% | -6.915126e-01 | -3.941518e-01 | -7.277035e-01 | -7.033311e-01 | -3.753652e-01 | -6.707714e-01 | -7.975893e-01 | -5.687025e-01 | -8.261444e-01 | -6.713383e-01 |
| 50% | -1.545828e-01 | -1.045992e-01 | -2.342004e-02 | -1.041162e-01 | 1.690075e-01 | -2.002836e-01 | 8.802193e-03 | -1.504846e-01 | -1.235402e-01 | -1.725936e-01 |
| 75% | 6.488227e-01 | 4.944892e-01 | 6.695891e-01 | 7.967360e-01 | 5.923753e-01 | 7.341302e-01 | 7.666664e-01 | 6.861950e-01 | 6.405214e-01 | 6.957530e-01 |
| max | 2.152662e+00 | 2.619401e+00 | 2.197226e+00 | 1.992324e+00 | 2.588252e+00 | 1.954345e+00 | 2.016993e+00 | 2.719871e+00 | 2.439800e+00 | 3.003254e+00 |
Analysis: (RandomOverSample,randomUnderSample,oriinal dataset)
5 Model training, testing and tuning
5.A Use any Supervised Learning technique to train a model
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
def test_train_analysis(ytrain,ytest,predict_train,predict_test):
train=performance_analysis(ytrain,predict_train)
test=performance_analysis(ytest,predict_test)
data= { 'train' : train,
'test' :test
}
Name= ['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"]
index=Name
df2 = pd.DataFrame(data, index)
df2.reset_index(inplace = True)
display(df2)
def conf_metrix(y,pred):
cm = metrics.confusion_matrix(y, pred, labels=[0, 1])
df_cm = pd.DataFrame(cm, index = [i for i in ["Fail","Pass"]],
columns = [i for i in ["Fail","Pass"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True,fmt= 'd')
plt.show()
def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
lb = preprocessing.LabelBinarizer()
lb.fit(y_test)
y_test = lb.transform(y_test)
y_pred = lb.transform(y_pred)
return roc_auc_score(y_test, y_pred, average=average)
def complete_analysis(X_train,X_test,y_train,y_test,ML):
pred_train = ML.predict(X_train)
pred_test = ML.predict(X_test)
test_train_analysis(y_train,y_test,pred_train,pred_test)
conf_metrix(y_test,pred_test )
print("Classification report on training data=================================")
print(classification_report(y_train,pred_train ))
print("Classification report on test data=================================")
print(classification_report(y_test,pred_test ))
def performance_analysis(a,b):
q=[]
q.append(accuracy_score(a, b))
q.append(precision_score(a, b,average="macro"))
q.append(recall_score(a, b,average="macro"))
q.append(f1_score(a, b,average="macro"))
q.append(multiclass_roc_auc_score(a, b, average="macro"))
return q
def summary_table(models,xtrain,xtest,ytrain,ytest):
df_S = pd.DataFrame(index=['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"])
for i in models:
x=performance_analysis(ytrain,i.predict(xtrain))
df_S [str(i)[0:9]+"_Train"] = x
y=performance_analysis(ytest,i.predict(xtest))
df_S [str(i)[0:9]+"_Test"]=y
return df_S
def dtree_view(ML,train_x):
fn = list(train_x)
cn = ['No', 'Yes']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4, 4), dpi=300)
plot_tree(ML, feature_names = fn, class_names=cn, filled = True)
def dtree_ml(xtrain,xtest,ytrain,ytest):
dTree1 = DecisionTreeClassifier(random_state=1)
dTree1.fit(xtrain, ytrain)
complete_analysis(xtrain,xtest,ytrain,ytest,dTree1)
print("Top 5 features are************************************************")
feature_scores = pd.Series(dTree1.feature_importances_, index=xtrain.columns).sort_values(ascending=False)
print(feature_scores.head())
dtree_view(dTree1,xtrain)
return dTree1
# def dtree_view(ML,train_x):
# fn = list(train_x)
# cn = ['No', 'Yes']
# fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4, 4), dpi=300)
# plot_tree(ML, feature_names = fn, class_names=cn, filled = True)
Performance analysis on RandomUnder sample dataset using decision tree.
dtree_ml(X_train_rus_scaled,X_test_rus_scaled,y_train_rus, y_test_rus)
| index | train | test | |
|---|---|---|---|
| 0 | Accuracy | 1.0 | 0.557692 |
| 1 | Recall | 1.0 | 0.565476 |
| 2 | Precision | 1.0 | 0.567588 |
| 3 | F1-score | 1.0 | 0.556215 |
| 4 | roc_auc_score | 1.0 | 0.567588 |
Classification report on training data=================================
precision recall f1-score support
0 1.00 1.00 1.00 83
1 1.00 1.00 1.00 73
accuracy 1.00 156
macro avg 1.00 1.00 1.00 156
weighted avg 1.00 1.00 1.00 156
Classification report on test data=================================
precision recall f1-score support
0 0.46 0.62 0.53 21
1 0.67 0.52 0.58 31
accuracy 0.56 52
macro avg 0.57 0.57 0.56 52
weighted avg 0.58 0.56 0.56 52
Top 5 features are************************************************
59 0.167229
460 0.143660
129 0.136141
316 0.125753
200 0.104427
dtype: float64
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
Performance analysis on RandomOver sample dataset using decision tree.
dtree_ml(X_train_ros_scaled,X_test_ros_scaled,y_train_ros, y_test_ros)
| index | train | test | |
|---|---|---|---|
| 0 | Accuracy | 1.0 | 0.864754 |
| 1 | Recall | 1.0 | 0.879573 |
| 2 | Precision | 1.0 | 0.863678 |
| 3 | F1-score | 1.0 | 0.863160 |
| 4 | roc_auc_score | 1.0 | 0.863678 |
Classification report on training data=================================
precision recall f1-score support
0 1.00 1.00 1.00 1093
1 1.00 1.00 1.00 1101
accuracy 1.00 2194
macro avg 1.00 1.00 1.00 2194
weighted avg 1.00 1.00 1.00 2194
Classification report on test data=================================
precision recall f1-score support
0 0.81 0.96 0.88 370
1 0.95 0.77 0.85 362
accuracy 0.86 732
macro avg 0.88 0.86 0.86 732
weighted avg 0.88 0.86 0.86 732
Top 5 features are************************************************
59 0.171870
200 0.135276
64 0.125750
460 0.123556
122 0.092750
dtype: float64
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
Performance is better on decision tree with RandomOver sample dataset
5.B Use cross validation techniques.
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
def cross_validation(n_split,ml, xtrain, ytrain):
kf = KFold(n_splits=n_split, shuffle=True, random_state=0)
scores = cross_val_score(ml, xtrain, ytrain, cv=kf, scoring='accuracy')
print("Cross-validation scoreson train data:", scores)
mean_accuracy = np.mean(scores)
print("Mean accuracy on traindata:", mean_accuracy)
print("Standard Deviation:", np.std(scores))
Basic cross validation
dtree1= DecisionTreeClassifier(random_state=1)
# Cross Validation on random under sampler
print("****Cross Validation on random under sampler****")
cross_validation(5,dtree1,X_train_rus_scaled, y_train_rus)
print("****Cross Validation on random over sampler****")
# Cross Validation on random over sampler
cross_validation(5,dtree1,X_train_ros_scaled, y_train_ros)
****Cross Validation on random under sampler**** Cross-validation scoreson train data: [0.71875 0.51612903 0.5483871 0.48387097 0.67741935] Mean accuracy on traindata: 0.5889112903225806 Standard Deviation: 0.09237391919221724 ****Cross Validation on random over sampler**** Cross-validation scoreson train data: [0.92710706 0.97266515 0.9498861 0.95899772 0.95205479] Mean accuracy on traindata: 0.9521421661934035 Standard Deviation: 0.0148327026890827
Stratified Kfold Using Scikit-Learn
from sklearn.model_selection import StratifiedKFold
def stratified_fold_fun(n_splits_count,ml,xtrain, ytrain):
stratified_kfold = StratifiedKFold(n_splits=n_splits_count, shuffle=True, random_state=1)
scores = cross_val_score(ml, xtrain, ytrain, cv=stratified_kfold, scoring='accuracy')
print("Cross-validation scores:", scores)
mean_accuracy = np.mean(scores)
print("Mean accuracy:", mean_accuracy)
print("Standard Deviation:", np.std(scores))
## Stratified Cross Validation on random under sampler
print("**** Stratified Cross Validation on random under sampler****")
stratified_fold_fun(5,dtree1,X_train_rus_scaled, y_train_rus)
## Stratified Cross Validation on random over sampler
print("**** Stratified Cross Validation on random over sampler****")
stratified_fold_fun(5,dtree1,X_train_ros_scaled, y_train_ros)
**** Stratified Cross Validation on random under sampler**** Cross-validation scores: [0.59375 0.58064516 0.58064516 0.64516129 0.74193548] Mean accuracy: 0.6284274193548386 Standard Deviation: 0.061535217448810756 **** Stratified Cross Validation on random over sampler**** Cross-validation scores: [0.95216401 0.95899772 0.95216401 0.94305239 0.96575342] Mean accuracy: 0.9544263113551971 Standard Deviation: 0.007600044221755339
LeaveOneOut
from sklearn.model_selection import LeaveOneOut
def loocv_fun(ml,xtrain, ytrain):
loocv = LeaveOneOut()
scores =cross_val_score(ml,xtrain, ytrain, cv=loocv,scoring='accuracy')
mean_accuracy = np.mean(scores)
print("Mean accuracy:", mean_accuracy)
print("Standard Deviation:", np.std(scores))
dtree1= DecisionTreeClassifier(random_state=1)
## LOOCV Cross Validation on random under sampler
print("**** LOOCV Cross Validation on random under sampler****")
loocv_fun(dtree1,X_train_rus_scaled, y_train_rus)
## LOOCV Cross Validation on random over sampler
print("**** LOOCV Cross Validation on random over sampler****")
loocv_fun(dtree1,X_train_ros_scaled, y_train_ros)
**** LOOCV Cross Validation on random under sampler**** Mean accuracy: 0.6089743589743589 Standard Deviation: 0.4879801113632886 **** LOOCV Cross Validation on random over sampler**** Mean accuracy: 0.9658158614402917 Standard Deviation: 0.18170190761419916
Analysis:
Accuracy is improved on LOOCV cross validation method for both Random Under sample dataset and Random Over sample dataset but the standard deviation is also very high in LOOCV.
I will choose Stratified Cross Validation , as it has good accuracy mean with low standard deviation.
5.C. Apply hyper-parameter tuning techniques to get the best accuracy
def grid_search_dtree(xtrain,xtest,ytrain,ytest):
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
'ccp_alpha': [0.1, .01, .001],
'max_depth' : list(range(1,20)),
'criterion' :['gini', 'entropy',"log_loss"],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
dtree_grid = DecisionTreeClassifier(random_state=1)
dtree_grid = GridSearchCV(estimator=dtree_grid, param_grid=param_grid, cv=5, verbose=True)
dtree_grid .fit(xtrain, ytrain)
print(dtree_grid.best_params_)
print(dtree_grid.best_estimator_)
complete_analysis(xtrain,xtest,ytrain,ytest,dtree_grid)
dtree_view(dtree_grid.best_estimator_,xtrain)
print("Top 5 features are************************************************")
feature_scores = pd.Series(dtree_grid.best_estimator_.feature_importances_, index=xtrain.columns).sort_values(ascending=False)
print(feature_scores.head())
return dtree_grid
grid_search_dtree(X_train_ros_scaled, X_test_ros_scaled, y_train_ros, y_test_ros)
Fitting 5 folds for each of 4617 candidates, totalling 23085 fits
{'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 19, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=19,
max_features='auto', random_state=1)
| index | train | test | |
|---|---|---|---|
| 0 | Accuracy | 1.0 | 0.837432 |
| 1 | Recall | 1.0 | 0.847419 |
| 2 | Precision | 1.0 | 0.836501 |
| 3 | F1-score | 1.0 | 0.835974 |
| 4 | roc_auc_score | 1.0 | 0.836501 |
Classification report on training data=================================
precision recall f1-score support
0 1.00 1.00 1.00 1093
1 1.00 1.00 1.00 1101
accuracy 1.00 2194
macro avg 1.00 1.00 1.00 2194
weighted avg 1.00 1.00 1.00 2194
Classification report on test data=================================
precision recall f1-score support
0 0.79 0.92 0.85 370
1 0.90 0.75 0.82 362
accuracy 0.84 732
macro avg 0.85 0.84 0.84 732
weighted avg 0.85 0.84 0.84 732
Top 5 features are************************************************
28 0.151516
59 0.130922
316 0.122585
200 0.105729
460 0.104343
dtype: float64
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy', 'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy', 'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)
grid_search_dtree(X_train_rus_scaled, X_test_rus_scaled, y_train_rus, y_test_rus)
Fitting 5 folds for each of 4617 candidates, totalling 23085 fits
{'ccp_alpha': 0.01, 'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}
DecisionTreeClassifier(ccp_alpha=0.01, max_depth=7, max_features='auto',
min_samples_leaf=4, min_samples_split=10,
random_state=1)
| index | train | test | |
|---|---|---|---|
| 0 | Accuracy | 0.833333 | 0.442308 |
| 1 | Recall | 0.835062 | 0.449405 |
| 2 | Precision | 0.835947 | 0.447773 |
| 3 | F1-score | 0.833306 | 0.440445 |
| 4 | roc_auc_score | 0.835947 | 0.447773 |
Classification report on training data=================================
precision recall f1-score support
0 0.88 0.80 0.84 83
1 0.79 0.88 0.83 73
accuracy 0.83 156
macro avg 0.84 0.84 0.83 156
weighted avg 0.84 0.83 0.83 156
Classification report on test data=================================
precision recall f1-score support
0 0.36 0.48 0.41 21
1 0.54 0.42 0.47 31
accuracy 0.44 52
macro avg 0.45 0.45 0.44 52
weighted avg 0.47 0.44 0.45 52
Top 5 features are************************************************
64 0.287066
200 0.180245
316 0.144408
59 0.133267
295 0.114281
dtype: float64
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy', 'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_grid={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy', 'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19],
'max_features': ['auto', 'sqrt', 'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)
from sklearn.model_selection import RandomizedSearchCV
def random_search_dtree(xtrain,xtest,ytrain,ytest):
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
'ccp_alpha': [0.1, .01, .001],
'max_depth' : list(range(1,20)),
'criterion' :['gini', 'entropy',"log_loss"],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
dtree_rm = DecisionTreeClassifier(random_state=1)
dtree_rm = RandomizedSearchCV(estimator=dtree_rm, param_distributions=param_grid, cv=5, verbose=True)
dtree_rm .fit(xtrain, ytrain)
print(dtree_rm.best_params_)
print(dtree_rm.best_estimator_)
complete_analysis(xtrain,xtest,ytrain,ytest,dtree_rm)
dtree_view(dtree_rm.best_estimator_,xtrain)
print("Top 5 features are************************************************")
feature_scores = pd.Series(dtree_rm.best_estimator_.feature_importances_, index=xtrain.columns).sort_values(ascending=False)
print(feature_scores.head())
return dtree_rm
random_search_dtree(X_train_ros_scaled, X_test_ros_scaled, y_train_ros, y_test_ros)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 18, 'criterion': 'entropy', 'ccp_alpha': 0.001}
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=18,
max_features='log2', min_samples_split=10,
random_state=1)
| index | train | test | |
|---|---|---|---|
| 0 | Accuracy | 0.994075 | 0.834699 |
| 1 | Recall | 0.994165 | 0.845269 |
| 2 | Precision | 0.994053 | 0.833739 |
| 3 | F1-score | 0.994074 | 0.833130 |
| 4 | roc_auc_score | 0.994053 | 0.833739 |
Classification report on training data=================================
precision recall f1-score support
0 1.00 0.99 0.99 1093
1 0.99 1.00 0.99 1101
accuracy 0.99 2194
macro avg 0.99 0.99 0.99 2194
weighted avg 0.99 0.99 0.99 2194
Classification report on test data=================================
precision recall f1-score support
0 0.79 0.92 0.85 370
1 0.90 0.75 0.82 362
accuracy 0.83 732
macro avg 0.85 0.83 0.83 732
weighted avg 0.84 0.83 0.83 732
Top 5 features are************************************************
200 0.154357
59 0.150559
129 0.116408
28 0.107659
460 0.100061
dtype: float64
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy',
'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15,
16, 17, 18, 19],
'max_features': ['auto', 'sqrt',
'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy',
'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15,
16, 17, 18, 19],
'max_features': ['auto', 'sqrt',
'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)
random_search_dtree(X_train_rus_scaled, X_test_rus_scaled, y_train_rus, y_test_rus)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 5, 'criterion': 'entropy', 'ccp_alpha': 0.001}
DecisionTreeClassifier(ccp_alpha=0.001, criterion='entropy', max_depth=5,
max_features='sqrt', random_state=1)
| index | train | test | |
|---|---|---|---|
| 0 | Accuracy | 0.826923 | 0.538462 |
| 1 | Recall | 0.826137 | 0.550225 |
| 2 | Precision | 0.826622 | 0.551459 |
| 3 | F1-score | 0.826345 | 0.537778 |
| 4 | roc_auc_score | 0.826622 | 0.551459 |
Classification report on training data=================================
precision recall f1-score support
0 0.84 0.83 0.84 83
1 0.81 0.82 0.82 73
accuracy 0.83 156
macro avg 0.83 0.83 0.83 156
weighted avg 0.83 0.83 0.83 156
Classification report on test data=================================
precision recall f1-score support
0 0.45 0.62 0.52 21
1 0.65 0.48 0.56 31
accuracy 0.54 52
macro avg 0.55 0.55 0.54 52
weighted avg 0.57 0.54 0.54 52
Top 5 features are************************************************
316 0.245399
59 0.173167
200 0.152714
28 0.112035
129 0.106595
dtype: float64
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy',
'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15,
16, 17, 18, 19],
'max_features': ['auto', 'sqrt',
'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=1),
param_distributions={'ccp_alpha': [0.1, 0.01, 0.001],
'criterion': ['gini', 'entropy',
'log_loss'],
'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15,
16, 17, 18, 19],
'max_features': ['auto', 'sqrt',
'log2'],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10]},
verbose=True)DecisionTreeClassifier(random_state=1)
DecisionTreeClassifier(random_state=1)
Analysis:
Use any other technique/method which can enhance the model performance Display and explain the classification report in detail.
pca=PCA(n_components=10,random_state=10)
pca_model=pca.fit_transform(X_train_ros_scaled)
#calculate the variance
var_explained_per=pca.explained_variance_/np.sum(pca.explained_variance_)
print("Variance_explained_variance=",var_explained_per)
#Cummulative Sum
cum_var_explained=np.cumsum(var_explained_per)
print("Cummuative_vaiance_explained=",cum_var_explained)
plt.plot(cum_var_explained,marker='*',markerfacecolor='black', markersize=8)
plt.axhline(y = .90)
plt.xlabel('n_components')
plt.ylabel('Cummuative_vaiance_explained')
plt.show()
Variance_explained_variance= [0.16114244 0.12744209 0.11604669 0.10748442 0.09961551 0.09512159 0.08998848 0.07857 0.06506174 0.05952703] Cummuative_vaiance_explained= [0.16114244 0.28858453 0.40463122 0.51211564 0.61173115 0.70685274 0.79684123 0.87541122 0.94047297 1. ]
pca=PCA(n_components=7, random_state=10)
pca_model_train=pca.fit_transform(X_train_ros_scaled)
def data_analysis(y,predicted_x):
result=performance_analysis(y,predicted_x)
data= { 'performance' : result,
}
Name= ['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"]
index=Name
df2 = pd.DataFrame(data, index)
df2.reset_index(inplace = True)
display(df2)
dtree_pca = DecisionTreeClassifier(random_state=1)
def pca_fun(ML,xtrain_scaled,ytrain,xtest,ytest):
pca=PCA(n_components=7, random_state=10)
pca_model_train=pca.fit_transform(xtrain_scaled)
ML.fit(pca_model_train,ytrain)
ML_model = ML.predict(pca_model_train)
print(classification_report(ytrain,ML_model))
data_analysis(ytrain,ML_model)
conf_metrix(ytrain,ML_model)
print("############")
pca_model_test=pca.transform(xtest)
dtree_pca_model2 = ML.predict(pca_model_test)
print(classification_report(ytest,dtree_pca_model2))
data_analysis(ytest,dtree_pca_model2)
conf_metrix(ytest,dtree_pca_model2)
pca_fun(dtree_pca,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
precision recall f1-score support
0 1.00 1.00 1.00 1093
1 1.00 1.00 1.00 1101
accuracy 1.00 2194
macro avg 1.00 1.00 1.00 2194
weighted avg 1.00 1.00 1.00 2194
| index | performance | |
|---|---|---|
| 0 | Accuracy | 1.0 |
| 1 | Recall | 1.0 |
| 2 | Precision | 1.0 |
| 3 | F1-score | 1.0 |
| 4 | roc_auc_score | 1.0 |
############
precision recall f1-score support
0 0.71 0.95 0.81 370
1 0.92 0.60 0.72 362
accuracy 0.77 732
macro avg 0.81 0.77 0.77 732
weighted avg 0.81 0.77 0.77 732
| index | performance | |
|---|---|---|
| 0 | Accuracy | 0.773224 |
| 1 | Recall | 0.810450 |
| 2 | Precision | 0.771316 |
| 3 | F1-score | 0.765361 |
| 4 | roc_auc_score | 0.771316 |
pca_fun(dtree_pca,X_train_rus_scaled,y_train_rus,X_test_rus_scaled,y_test_rus)
precision recall f1-score support
0 1.00 1.00 1.00 83
1 1.00 1.00 1.00 73
accuracy 1.00 156
macro avg 1.00 1.00 1.00 156
weighted avg 1.00 1.00 1.00 156
| index | performance | |
|---|---|---|
| 0 | Accuracy | 1.0 |
| 1 | Recall | 1.0 |
| 2 | Precision | 1.0 |
| 3 | F1-score | 1.0 |
| 4 | roc_auc_score | 1.0 |
############
precision recall f1-score support
0 0.48 0.62 0.54 21
1 0.68 0.55 0.61 31
accuracy 0.58 52
macro avg 0.58 0.58 0.57 52
weighted avg 0.60 0.58 0.58 52
| index | performance | |
|---|---|---|
| 0 | Accuracy | 0.576923 |
| 1 | Recall | 0.580741 |
| 2 | Precision | 0.583717 |
| 3 | F1-score | 0.574405 |
| 4 | roc_auc_score | 0.583717 |
Classification report Analysis:
Apply the above steps for all possible models that you have learnt so far.
def stratified_fold_fun2(n_splits_count,ml,xtrain, ytrain):
stratified_kfold = StratifiedKFold(n_splits=n_splits_count, shuffle=True, random_state=1)
scores = cross_val_score(ml, xtrain, ytrain, cv=stratified_kfold, scoring='accuracy')
mean_accuracy = np.mean(scores)
return mean_accuracy
models = []
models.append(("LR", LogisticRegression()))
models.append(("KNN", KNeighborsClassifier()))
models.append(("SVM", SVC(kernel='sigmoid')))
models.append(("DT", DecisionTreeClassifier()))
models.append(("RF", RandomForestClassifier()))
models.append(("AB", AdaBoostClassifier()))
models.append(("GBT", GradientBoostingClassifier()))
df_S = pd.DataFrame(columns=['Model', 'mean accuracy_train_RUS','mean accuracy_train_ROS'])
#testing models
results = []
names = []
for name, model in models:
a=stratified_fold_fun2(5,model,X_train_rus_scaled, y_train_rus)
results.append(a)
names.append(name)
mean_accuracy_rus = a.mean()
b = stratified_fold_fun2(5, model, X_train_ros_scaled, y_train_ros)
mean_accuracy_ros = b.mean()
df_S = df_S.append({'Model': name, 'mean accuracy_train_RUS': mean_accuracy_rus, 'mean accuracy_train_ROS': mean_accuracy_ros}, ignore_index=True)
# Display the DataFrame
df_S
| Model | mean accuracy_train_RUS | mean accuracy_train_ROS | |
|---|---|---|---|
| 0 | LR | 0.640323 | 0.651309 |
| 1 | KNN | 0.538105 | 0.889247 |
| 2 | SVM | 0.634274 | 0.519572 |
| 3 | DT | 0.621976 | 0.957615 |
| 4 | RF | 0.633669 | 0.986783 |
| 5 | AB | 0.551008 | 0.820427 |
| 6 | GBT | 0.570363 | 0.914311 |
def summary_table2(models,x,y):
df_S1 = pd.DataFrame(index=['Accuracy', 'Recall', 'Precision', 'F1-score',"roc_auc_score"])
for i in models:
y=performance_analysis(y,i.predict(x))
df_S1 [str(i)[0:9]+"result"]=y
return df_S1
6.A. Display and compare all the models designed with their train and test accuracies.
print("*****************PCA on multiple algos o Random Over Sample TEST data set")
models = [
("LR", LogisticRegression(random_state=1)),
("KNN", KNeighborsClassifier()),
("SVM", SVC(kernel='sigmoid')),
("DT", DecisionTreeClassifier(random_state=1)),
("RF", RandomForestClassifier(random_state=1)),
("AB", AdaBoostClassifier(random_state=1)),
("GBT", GradientBoostingClassifier(random_state=1))
]
# Create a DataFrame to store the results
df_S = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])
# Define a function for PCA and model training
def pca_fun_test(ML, xtrain_scaled, ytrain, xtest, ytest):
pca = PCA(n_components=7, random_state=1)
pca_model_train = pca.fit_transform(xtrain_scaled)
ML.fit(pca_model_train, ytrain)
pca_model_test = pca.transform(xtest)
ML_model = ML.predict(pca_model_test)
# Calculate evaluation metrics
accuracy = accuracy_score(ytest, ML_model)
recall = recall_score(ytest, ML_model)
precision = precision_score(ytest, ML_model)
f1 = f1_score(ytest, ML_model)
roc_auc = roc_auc_score(ytest, ML_model)
return [accuracy, recall, precision, f1, roc_auc]
def pca_fun_train(ML, xtrain_scaled, ytrain, xtest, ytest):
pca = PCA(n_components=7, random_state=10)
pca_model_train = pca.fit_transform(xtrain_scaled)
ML.fit(pca_model_train, ytrain)
ML_model = ML.predict(pca_model_train)
# Calculate evaluation metrics
accuracy = accuracy_score(ytrain, ML_model)
recall = recall_score(ytrain, ML_model)
precision = precision_score(ytrain, ML_model)
f1 = f1_score(ytrain, ML_model)
roc_auc = roc_auc_score(ytrain, ML_model)
return [accuracy, recall, precision, f1, roc_auc]
# Testing models
results = []
names = []
for name, model in models:
result = pca_fun_test(model,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
results.append(result)
names.append(name)
# Append the results to the DataFrame
df_S['Model'] = names
df_S[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results
# Display the DataFrame
print(df_S)
#
results.clear()
names.clear()
print("*****************PCA on multiple algos o Random Under Search on Test dataset")
df_S2 = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])
for name, model in models:
result = pca_fun_test(model,X_train_rus_scaled,y_train_rus,X_test_rus_scaled,y_test_rus)
results.append(result)
names.append(name)
# Append the results to the DataFrame
df_S2['Model'] = names
df_S2[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results
# Display the DataFrame
print(df_S2)
print("*****************PCA on multiple algos on Random Over Search on Train dataset")
results.clear()
names.clear()
df_S3 = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])
for name, model in models:
result = pca_fun_train(model,X_train_rus_scaled,y_train_rus,X_test_rus_scaled,y_test_rus)
results.append(result)
names.append(name)
# Append the results to the DataFrame
df_S3['Model'] = names
df_S3[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results
# Display the DataFrame
print(df_S3)
print("*****************PCA on multiple algos on Random Over Search on Train dataset")
results.clear()
names.clear()
df_S4 = pd.DataFrame(columns=['Model', 'Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score'])
for name, model in models:
result = pca_fun_train(model,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
results.append(result)
names.append(name)
# Append the results to the DataFrame
df_S4['Model'] = names
df_S4[['Accuracy', 'Recall', 'Precision', 'F1-score', 'roc_auc_score']] = results
# Display the DataFrame
print(df_S4)
*****************PCA on multiple algos o Random Over Sample TEST data set Model Accuracy Recall Precision F1-score roc_auc_score 0 LR 0.659836 0.676796 0.649867 0.663058 0.660019 1 KNN 0.894809 1.000000 0.824601 0.903870 0.895946 2 SVM 0.545082 0.511050 0.542522 0.526316 0.544714 3 DT 0.773224 0.596685 0.915254 0.722408 0.771316 4 RF 0.931694 0.864641 0.996815 0.926036 0.930969 5 AB 0.733607 0.676796 0.758514 0.715328 0.732992 6 GBT 0.763661 0.704420 0.794393 0.746706 0.763021 *****************PCA on multiple algos o Random Under Search on Test dataset Model Accuracy Recall Precision F1-score roc_auc_score 0 LR 0.653846 0.548387 0.809524 0.653846 0.678955 1 KNN 0.461538 0.483871 0.555556 0.517241 0.456221 2 SVM 0.653846 0.516129 0.842105 0.640000 0.686636 3 DT 0.576923 0.548387 0.680000 0.607143 0.583717 4 RF 0.576923 0.516129 0.695652 0.592593 0.591398 5 AB 0.500000 0.387097 0.631579 0.480000 0.526882 6 GBT 0.615385 0.580645 0.720000 0.642857 0.623656 *****************PCA on multiple algos on Random Over Search on Train dataset Model Accuracy Recall Precision F1-score roc_auc_score 0 LR 0.679487 0.616438 0.671642 0.642857 0.675689 1 KNN 0.679487 0.698630 0.645570 0.671053 0.680640 2 SVM 0.589744 0.547945 0.563380 0.555556 0.587226 3 DT 1.000000 1.000000 1.000000 1.000000 1.000000 4 RF 1.000000 1.000000 1.000000 1.000000 1.000000 5 AB 0.916667 0.890411 0.928571 0.909091 0.915085 6 GBT 1.000000 1.000000 1.000000 1.000000 1.000000 *****************PCA on multiple algos on Random Over Search on Train dataset Model Accuracy Recall Precision F1-score roc_auc_score 0 LR 0.667730 0.683015 0.664311 0.673533 0.667674 1 KNN 0.927074 1.000000 0.873117 0.932261 0.926807 2 SVM 0.520966 0.495005 0.524038 0.509108 0.521061 3 DT 1.000000 1.000000 1.000000 1.000000 1.000000 4 RF 1.000000 1.000000 1.000000 1.000000 1.000000 5 AB 0.813582 0.833787 0.802448 0.817817 0.813509 6 GBT 0.941203 0.988193 0.903654 0.944035 0.941031
Using only RandomOver Sample dataset and will perform pca+grid search
models = [
("KNN", KNeighborsClassifier()),
("SVM", SVC(kernel='sigmoid')),
("DT", DecisionTreeClassifier(random_state=1)),
("RF", RandomForestClassifier(random_state=1)),
("AB", AdaBoostClassifier(random_state=1)),
("GBT", GradientBoostingClassifier(random_state=1))
]
pca = PCA(n_components=7, random_state=1)
results_train = {
"Model": [],
"Accuracy": [],
"Recall": [],
"Precision": [],
"F1-score": [],
"roc_auc_score": []
}
results_test = {
"Model": [],
"Accuracy": [],
"Recall": [],
"Precision": [],
"F1-score": [],
"roc_auc_score": []
}
for name, model in models:
X_pca_train = pca.fit_transform(X_train_ros_scaled)
param_grids = {
"KNN": [
{
"n_neighbors": [3, 5, 7, 9],
"weights": ["uniform", "distance"]
}
],
"SVM": [
{
"C": [0.01, 0.1, 1],
"kernel": ["linear", "poly", "rbf", "sigmoid"],
"gamma": [ 0.01, 0.1, 1]
}
],
"DT": [
{
"criterion": ["gini", "entropy"],
"max_depth": [10, 20, 30, 40],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4]
}
],
"RF": [
{
"n_estimators": [100, 200],
"criterion": ["gini", "entropy"],
"max_depth": [10, 20, 30],
"min_samples_split": [2, 5, 10],
"min_samples_leaf": [1, 2, 4]
}
],
"AB": [
{
"n_estimators": [50, 100],
"learning_rate": [0.01, 0.1, 1.0]
}
],
"GBT": [
{
"n_estimators": [50, 100, 200],
"learning_rate": [0.01, 0.1, 0.5]
}
]
}
grid_search = GridSearchCV(model, param_grids[name], cv=5)
grid_search.fit(X_pca_train, y_train_ros)
best_model = grid_search.best_estimator_
accuracy_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='accuracy').mean()
recall_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='recall').mean()
precision_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='precision').mean()
f1_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='f1').mean()
roc_auc_train = cross_val_score(best_model, X_pca_train, y_train_ros, cv=5, scoring='roc_auc').mean()
# Apply PCA to the test data
X_pca_test = pca.transform(X_test_ros_scaled)
y_pred_test = best_model.predict(X_pca_test)
accuracy_test = accuracy_score(y_test_ros, y_pred_test)
recall_test = recall_score(y_test_ros, y_pred_test)
precision_test = precision_score(y_test_ros, y_pred_test)
f1_test = f1_score(y_test_ros, y_pred_test)
roc_auc_test = roc_auc_score(y_test_ros, y_pred_test)
results_train["Model"].append(name)
results_train["Accuracy"].append(accuracy_train)
results_train["Recall"].append(recall_train)
results_train["Precision"].append(precision_train)
results_train["F1-score"].append(f1_train)
results_train["roc_auc_score"].append(roc_auc_train)
results_test["Model"].append(name)
results_test["Accuracy"].append(accuracy_test)
results_test["Recall"].append(recall_test)
results_test["Precision"].append(precision_test)
results_test["F1-score"].append(f1_test)
results_test["roc_auc_score"].append(roc_auc_test)
results_train_df = pd.DataFrame(results_train)
results_test_df = pd.DataFrame(results_test)
print("Results for Train Data:")
print(results_train_df)
print("\nResults for Test Data:")
print(results_test_df)
Results for Train Data: Model Accuracy Recall Precision F1-score roc_auc_score 0 KNN 0.908387 1.000000 0.845755 0.916393 0.947843 1 SVM 0.988603 1.000000 0.977867 0.988792 1.000000 2 DT 0.954418 1.000000 0.917510 0.956790 0.955960 3 RF 0.989056 1.000000 0.978828 0.989259 1.000000 4 AB 0.832741 0.872834 0.808815 0.839350 0.908117 5 GBT 0.965361 1.000000 0.935931 0.966775 0.997194 Results for Test Data: Model Accuracy Recall Precision F1-score roc_auc_score 0 KNN 0.924863 1.000000 0.868106 0.929397 0.925676 1 SVM 0.990437 1.000000 0.981030 0.990424 0.990541 2 DT 0.758197 0.582873 0.890295 0.704508 0.756301 3 RF 0.927596 0.859116 0.993610 0.921481 0.926855 4 AB 0.733607 0.660221 0.768489 0.710253 0.732813 5 GBT 0.748634 0.541436 0.915888 0.680556 0.746394
Post Training and Conclusion
6.B Select the final best trained model along with your detailed comments for selecting this model.
Analysis:
6.C Pickle the selected model for future use.
def model2():
rfc=RandomForestClassifier()
pca_fun_test(rfc,X_train_ros_scaled,y_train_ros,X_test_ros_scaled,y_test_ros)
pca = PCA(n_components=7, random_state=10)
pca_model_train = pca.fit_transform(X_train_ros_scaled)
rfc=rfc.fit(pca_model_train, y_train_ros)
pca_model_test = pca.transform(X_test_ros_scaled)
rfc.predict(pca_model_test)
return rfc
with open('rfc_model.pkl', 'wb') as model_file:
pickle.dump(model2, model_file)
with open('rfc_model.pkl', 'rb') as model_file:
loaded_rfc = pickle.load(model_file)
D. Write your conclusion on the results.